// cd /projects/hsieh_project/proj_201809/code_1_data/
// qstata data_2_sales_ind_sum_add.do

// Log the start time of the run
display "Started at $S_DATE $S_TIME"

// Run-date stamp (two-digit year, month, day) built from Stata's current date
global rev_date: display %tdYYNNDD date("$S_DATE", "DMY")
display "${rev_date}"

// Project root and the data directory underneath it
global dir_proj "/projects/hsieh_project/proj_201809/"
global dir_data "${dir_proj}/data/"

// Dataset saved at the end of this do-file
global ds_out "${dir_data}/sales_ind_sum_add"

// gl_perc: percentile cutoffs looped over when building sales shares
// gl_vgeo: geography tags that appear in the ln_<geo>_ind_* variable names
global gl_perc "10 5 1"
global gl_vgeo "zipcode fips msa czone msa1983 msa1983cz"

//==============================================================================

// Load the input dataset sales_ind_sum from the data directory,
// replacing whatever is currently in memory
use "${dir_data}/sales_ind_sum", clear


// Percentile cutoffs for this run: take the global list, or fall back to "10" when it is empty
local l_perc = cond("${gl_perc}" == "", "10", "${gl_perc}")

//==============================================================================
/* 
Determine the starting year of industry. Can be further improved 

In analysis involving sales, we use different years 
*/

/* REDACTED 
Sales data come from the EC, and for some industries the first year of data differs.
We normally use 1977 and 2012 as the start and end years when computing changes in
sales measures (differences etc.).
However, for some industries the start and end years differ, depending on data availability:
e.g. if an industry only has data from 1987 onward, then the differences (e.g. change in log sales shares)
are calculated between 1987 and that industry's end year.

Code and comments related to these specific changes have been redacted */

// Order observations by sector, industry and year; the by-group statements below rely on this order
sort sector ch_ind year

/* REDACTED */

// Additional check
// n_ind_f1 = n_ind of the next observation within the same sector/industry
// (missing for the last observation of each group)
by sector ch_ind: gen n_ind_f1 = n_ind[_n+1]
// Flag observations whose following observation has more than 10 times as many n_ind.
// NOTE(review): if n_ind is 0 or missing the ratio is missing, and Stata treats missing
// as larger than any number, so those rows are flagged as well - confirm this is intended.
gen flag = 1 if n_ind_f1 / n_ind > 10 & n_ind_f1 < .

// Export industries with potential issues
// (all years of every sector/industry that has at least one flagged observation)
preserve
by sector ch_ind: egen flag_tot = total(flag)
keep if flag_tot > 0
keep year sector ch_ind sales_ind n_ind flag
save "${dir_data}/cw_year_sales_flag", replace
restore

// Manually select years for some industries
// (This should be modified when sales data is improved)
/* REDACTED */

// First and last year present for each sector/industry
by sector ch_ind: egen year_min = min(year)
by sector ch_ind: egen year_max = max(year)

// The lead variable was only needed for the check above
drop n_ind_f1

//==============================================================================
// Export a year-ind crosswalk 
// (which gives starting and ending years of sales data that we plan to use for each industry)

// Write the distinct (ch_ind, year_min, year_max) combinations to disk, then restore the full data.
// NOTE(review): year_min/year_max are computed by sector and ch_ind, but sector is dropped here;
// if a ch_ind can occur in more than one sector the crosswalk may hold several rows per ch_ind - confirm.
preserve
keep ch_ind year_min year_max
duplicates drop
save "${dir_data}/cw_year_sales", replace
// Second copy in the older Stata 12 file format
saveold "${dir_data}/cw_year_sales_v12", replace v(12)
restore

//==============================================================================
// Generate variables at industry level; Merge with emp data

// Keep the identifiers, the year variables (year, year_min, year_max) and every sales variable
keep year* sector ch_ind *sales*

gen year_orig = year // Real years
// Only the first and last year of each sector/industry are used
keep if year == year_min | year == year_max
replace year = 2013 if year == year_max // to use 2013 emp data instead of 2012

// Bring in the employment-side variables, matching on the (possibly recoded) year.
// The file path is quoted, as in the other use/save calls, so it survives spaces in the directory name.
merge 1:1 year sector ch_ind using "${dir_data}/ind_sum_all_add", keepus(n_ind ln_est_ind_r_* ln_*_ind_r2_* *emp* miss_*) // Employment data from original year
// Keep only observations found in both datasets
keep if _merge == 3
drop _merge

// Relabel years: year_min rows become 1977 and year_max rows become 2013.
// When year_min == year_max the second replace wins and the single row is labelled 2013.
drop year
gen year = .
replace year = 1977 if year_orig == year_min
replace year = 2013 if year_orig == year_max
order year_orig year sector ch_ind

// Remove any pre-existing versions of the variables generated below.
// Two separate drops: a single drop with both patterns removes nothing at all
// when either pattern has no match, which would leave stale variables behind.
capture noi drop ln_*_rc_*
capture noi drop ln_*_r2c*

// Log of total industry sales
gen ln_sales_ind = ln(sales_ind)

// Helper variables for deciles:
// the "100" cutoff is total sales and the "0" cutoff is zero, so the band
// differences below are also defined at the two ends (e.g. 10 - 0, 100 - 90)
gen sales_ind_100 = sales_ind
gen sales_ind_0 = 0

// Sales share
// For every cutoff p in l_perc this builds, from sales_ind_p:
//   sales_miss_p                  = 1 when n_ind < 100/p
//   saless_ind_p, ln_saless_ind_p = share of sales_ind_p in sales_ind, and its log
//   ln_est_ind_rc_p, ln_<geo>_ind_r2c_p = log sales share minus the matching employment-side variable
//   (for p >= 10) *_pd            = the same share for the band between p-10 and p
foreach li_perc in `l_perc' {
	// Variable-name-safe form of the cutoff: "." is not allowed in names, so "0.1" becomes "0_1"
	local vi_perc = subinstr("`li_perc'", ".", "_", .)
	
	// Mark cells with fewer than 100/p units in n_ind; their shares are set to missing below
	gen sales_miss_`vi_perc' = 1 if n_ind < 100/`li_perc'
	
	//-----------------------------
	// Top Firms
	gen saless_ind_`vi_perc' = sales_ind_`vi_perc' / sales_ind
	replace saless_ind_`vi_perc' = . if sales_miss_`vi_perc' == 1
	
	gen ln_saless_ind_`vi_perc' = ln(saless_ind_`vi_perc')
	replace ln_saless_ind_`vi_perc' = . if sales_miss_`vi_perc' == 1
    
	/* Intensive Margins */
	gen ln_est_ind_rc_`vi_perc' = ln_saless_ind_`vi_perc' - ln_est_ind_r_`vi_perc'
	foreach vgeo in $gl_vgeo {
		gen ln_`vgeo'_ind_r2c_`vi_perc' = ln_saless_ind_`vi_perc' - ln_`vgeo'_ind_r2_`vi_perc'
	}
	
	/*
	-------------------------------------
	Sales share of decile firms (band between p-10 and p)
	*/
	// Compare the numeric cutoff, not the name-safe token: a token such as 12_5 is not
	// a valid number and would make both the condition and the subtraction fail.
	if `li_perc' >= 10 {
		// Name-safe form of the lower cutoff p-10 (e.g. 10 -> "0", 12.5 -> "2_5")
		local lo_perc = subinstr(string(`li_perc' - 10), ".", "_", .)
		gen sales_ind_`vi_perc'd = sales_ind_`vi_perc' - sales_ind_`lo_perc'
		gen saless_ind_`vi_perc'd = sales_ind_`vi_perc'd / sales_ind if sales_miss_`vi_perc' != 1
		gen ln_saless_ind_`vi_perc'd = ln(sales_ind_`vi_perc'd / sales_ind) if sales_miss_`vi_perc' != 1
	}

}

// Final ordering, then write the output dataset.
// The path is quoted, like the other save/use calls, so it survives spaces in the directory name.
sort ch_ind year
save "${ds_out}", replace
/*
The final data set has two years: 1977 and 2013
These are not real years of the data for some industries
The real year is recorded in year_min for 1977 and year_max for 2013, or year_orig
*/

// Log the end time of the run
di "Ended at $S_DATE $S_TIME"
// End of do file
